library(tidyverse) # master library to deal with data frames
library(readxl) # read xlsx or xls files
library(ggrepel) # ggplot add-on, to plot names that don't collapse in same position
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(here) # usefull to save plots in folders given a root
library(viridis) # color palette package
# library(ComplexHeatmap) # yeah, complex heatmaps
library(plotly)
I’ve downloaded the files from the Zimmermann paper from Nature, 2019. They have a big database of 2099 compounds from drugbank to which they run a Python script to extract the chemical fingerprints, using them to represent the chemical space via a PCA.
# biolog metabolites
biolog = read_csv('Biolog_metabolites.csv')
# drugbank coordinates
drugbank.coord = read.delim('.\\Data_drug_bacteria_gene_mapping\\Input\\drugbank_pca.txt', header = FALSE)
drugbank = read_csv('.\\Data_drug_bacteria_gene_mapping\\Input\\drugbank_approved_MW_150_1000_functional_groups_all.csv') %>% select(-Number_of_carboxylic_acids_1)
# plates to use from Biolog
plates = c("PM11C", "PM12B", "PM13B", "PM14A", "PM15B", "PM16A", "PM17A", "PM18C", "PM19" , "PM20B")
biolog.drug = biolog %>% filter(Plate %in% plates)
# list of drugs present in biolog plates
drugs = biolog.drug %>% select(Metabolite) %>% unique %>% t %>% as.character
Just a glimpse of the drug names:
head(drugs)
## [1] "Amikacin" "Chlortetracycline" "Lincomycin"
## [4] "Amoxicillin" "Cloxacillin" "Lomefloxacin"
And a glimpse of the huge data table from the paper:
head(drugbank)
## # A tibble: 6 x 127
## X1 index ALOGPS_LOGP ALOGPS_LOGS ALOGPS_SOLUBILI~ DATABASE_ID DATABASE_NAME
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 0 10 -0.55 -1.64 5.70e+00 g/l DB00114 drugbank
## 2 1 13 -3.09 -0.4 6.20e+01 g/l DB00117 drugbank
## 3 2 14 0.05 -2.45 1.62e+00 g/l DB00118 drugbank
## 4 3 16 -1.35 -1.6 4.14e+00 g/l DB00120 drugbank
## 5 4 17 0.17 -2.3 1.22e+00 g/l DB00121 drugbank
## 6 5 20 -3.49 -1.88 2.28e+00 g/l DB00125 drugbank
## # ... with 120 more variables: DRUGBANK_ID <chr>, DRUG_GROUPS <chr>,
## # EXACT_MASS <dbl>, FORMULA <chr>, GENERIC_NAME <chr>, ID <chr>,
## # INCHI_IDENTIFIER <chr>, INCHI_KEY <chr>, INTERNATIONAL_BRANDS <chr>,
## # JCHEM_ACCEPTOR_COUNT <dbl>, JCHEM_ATOM_COUNT <dbl>,
## # JCHEM_AVERAGE_POLARIZABILITY <dbl>, JCHEM_BIOAVAILABILITY <dbl>,
## # JCHEM_DONOR_COUNT <dbl>, JCHEM_FORMAL_CHARGE <dbl>,
## # JCHEM_GHOSE_FILTER <dbl>, JCHEM_IUPAC <chr>, JCHEM_LOGP <dbl>,
## # JCHEM_MDDR_LIKE_RULE <dbl>, JCHEM_NUMBER_OF_RINGS <dbl>,
## # JCHEM_PHYSIOLOGICAL_CHARGE <dbl>, JCHEM_PKA <dbl>,
## # JCHEM_PKA_STRONGEST_ACIDIC <dbl>, JCHEM_PKA_STRONGEST_BASIC <dbl>,
## # JCHEM_POLAR_SURFACE_AREA <dbl>, JCHEM_REFRACTIVITY <dbl>,
## # JCHEM_ROTATABLE_BOND_COUNT <dbl>, JCHEM_RULE_OF_FIVE <dbl>,
## # JCHEM_TRADITIONAL_IUPAC <chr>, JCHEM_VEBER_RULE <dbl>,
## # MOLECULAR_WEIGHT <dbl>, PRODUCTS <chr>, SALTS <chr>,
## # SECONDARY_ACCESSION_NUMBERS <chr>, SMILES <chr>, SYNONYMS <chr>,
## # Number_of_aliphatic_carboxylic_acids <dbl>,
## # Number_of_aliphatic_hydroxyl_groups <dbl>,
## # Number_of_aliphatic_hydroxyl_groups_excluding_tert_OH <dbl>,
## # Number_of_N_functional_groups_attached_to_aromatics <dbl>,
## # Number_of_Aromatic_carboxylic_acide <dbl>,
## # Number_of_aromatic_nitrogens <dbl>, Number_of_aromatic_amines <dbl>,
## # Number_of_aromatic_hydroxyl_groups <dbl>, Number_of_carboxylic_acids <dbl>,
## # Number_of_carbonyl_O <dbl>, Number_of_carbonyl_O__excluding_COOH <dbl>,
## # Number_of_thiocarbonyl <dbl>,
## # Number_of_C_OH_CCN_Ctert_alkyl_or_C_OH_CCNcyclic <dbl>,
## # Number_of_Imines <dbl>, Number_of_Tertiary_amines <dbl>,
## # Number_of_Secondary_amines <dbl>, Number_of_Primary_amines <dbl>,
## # Number_of_hydroxylamine_groups <dbl>, Number_of_XCCNR_groups <dbl>,
## # Number_of_tert_alicyclic_amines__no_heteroatoms__not_quinine_like_bridged_N_ <dbl>,
## # Number_of_H_pyrrole_nitrogens <dbl>, Number_of_thiol_groups <dbl>,
## # Number_of_aldehydes <dbl>,
## # Number_of_alkyl_carbamates__subject_to_hydrolysis_ <dbl>,
## # Number_of_alkyl_halides <dbl>,
## # Number_of_allylic_oxidation_sites_excluding_steroid_dienone <dbl>,
## # Number_of_amides <dbl>, Number_of_amidine_groups <dbl>,
## # Number_of_anilines <dbl>,
## # Number_of_aryl_methyl_sites_for_hydroxylation <dbl>,
## # Number_of_azide_groups <dbl>, Number_of_azo_groups <dbl>,
## # Number_of_barbiturate_groups <dbl>, Number_of_benzene_rings <dbl>,
## # Number_of_benzodiazepines_with_no_additional_fused_rings <dbl>,
## # Bicyclic <dbl>, Number_of_diazo_groups <dbl>,
## # Number_of_dihydropyridines <dbl>, Number_of_epoxide_rings <dbl>,
## # Number_of_esters <dbl>, Number_of_ether_oxygens__including_phenoxy_ <dbl>,
## # Number_of_furan_rings <dbl>, Number_of_guanidine_groups <dbl>,
## # Number_of_halogens <dbl>, Number_of_hydrazine_groups <dbl>,
## # Number_of_hydrazone_groups <dbl>, Number_of_imidazole_rings <dbl>,
## # Number_of_imide_groups <dbl>, Number_of_isocyanates <dbl>,
## # Number_of_isothiocyanates <dbl>, Number_of_ketones <dbl>,
## # Number_of_ketones_excluding_diaryl__a_b_unsat__dienones__heteroatom_on_Calpha <dbl>,
## # Number_of_beta_lactams <dbl>, Number_of_cyclic_esters__lactones_ <dbl>,
## # Number_of_methoxy_groups__OCH3 <dbl>, Number_of_morpholine_rings <dbl>,
## # Number_of_nitriles <dbl>, Number_of_nitro_groups <dbl>,
## # Number_of_nitro_benzene_ring_substituents <dbl>,
## # Number_of_non_ortho_nitro_benzene_ring_substituents <dbl>,
## # Number_of_nitroso_groups__excluding_NO2 <dbl>,
## # Number_of_oxazole_rings <dbl>, Number_of_oxime_groups <dbl>,
## # Number_of_para_hydroxylation_sites <dbl>, ...
Doing some data transformations (including the names of biolog compounds and that into the huge data table), we notice one important thing: only ~70 compounds from biolog are in the original data table. Anyway, it’s a good practice to see if our drugs are occupying enough of the drug space.
drugbank.coord['Drugbank'] = drugbank$GENERIC_NAME
# create a new variable, just in case
test2 = drugbank.coord
# data transformation, create Category with Biolog (if the compound is Drugbank) and Drugbank (if not)
test2 = test2 %>%
mutate(biolog = ifelse(Drugbank %in% drugs, Drugbank, NA),
Category = ifelse(Drugbank %in% drugs, 'Biolog', 'Drugbank'),
Category = as.factor(Category))
rownames(test2) = test2$Drugbank
head(test2)
## V1 V2 V3 Drugbank biolog Category
## Pyridoxal Phosphate -0.452 -0.801 0.028 Pyridoxal Phosphate <NA> Drugbank
## Histidine -0.566 -0.949 0.606 Histidine <NA> Drugbank
## Ademetionine 0.375 -0.703 0.468 Ademetionine <NA> Drugbank
## L-Phenylalanine -0.982 -0.417 1.061 L-Phenylalanine <NA> Drugbank
## Biotin 0.686 -0.869 0.437 Biotin <NA> Drugbank
## L-Arginine -0.376 -1.576 1.169 L-Arginine <NA> Drugbank
And let’s plot it in 3D with Plotly:
fig <- plot_ly(data = test2, x = ~V1, y = ~V2, z = ~V3, text = rownames(test2),
color = ~Category, colors = c('#FF0900', '#B8B8B8'),
marker = list(size = 3))
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
In order to try to complete our dataset with more drugs, I’ve extracted the KEGG IDs from the biolog table, passed them into a website that converts them into PubChem IDs, and got the chemical fingerprints using an adapted version of the Python script available at their GitHub repository.
# first, separate KEGG ids to generate a new column to the biolog data frame with pubchem ids
kegg_ids = biolog %>%
select(KEGG_ID) %>% t %>% as.character %>% unique
# remove NAs
kegg_ids = kegg_ids[complete.cases(kegg_ids)]
# write list of genes
write.table(kegg_ids, 'KEGG_IDs_biolog.txt', quote = F, col.names = F, row.names = F)
## go here to convert: http://csbg.cnb.csic.es/mbrole/conversion.jsp
# read generated list and merge with biolog
kegg2pub = read.csv("D:/MRC_Postdoc/Pangenomic/Chem_space/KEGG2PubChemIDs.txt")
biolog = biolog %>% left_join(kegg2pub)
### let's get the missing drugs (as many as we can, at least)
missing = drugs[!drugs %in% unique(drugbank$GENERIC_NAME)]
drugs_missing = biolog %>%
filter(Metabolite %in% missing) %>%
select(PubChem_ID) %>% t %>% as.character %>% unique
# remove NAs
drugs_missing = drugs_missing[complete.cases(drugs_missing)]
write.table(drugs_missing, 'PubChem_ID_missing_metabolites.txt', quote = F, col.names = F, row.names = F)
# after downloading them from pubchem, the file is named as: Metab_structures_missing_biolog.sdf
# file with chem fingerprints from pubchem compounds (~100 more, not bad)
biolog_metabolites_PubChem = read_csv("biolog_metabolites_PubChem.csv") %>% select(-X1, -index, -Number_of_carboxylic_acids_1, -ID) %>%
rename(PubChem_ID = PUBCHEM_COMPOUND_CID)
dummy = biolog %>%
filter(Metabolite %in% missing) %>%
select(Metabolite, PubChem_ID) %>% unique
biolog_metabolites_PubChem = biolog_metabolites_PubChem %>% left_join(dummy, by = 'PubChem_ID') %>%
select(PubChem_ID, Metabolite, everything()) %>% rename(GENERIC_NAME = Metabolite)
# bind rows
complete = bind_rows(drugbank, biolog_metabolites_PubChem)
# save data table into csv
write.csv(complete, 'Complete_chem_fingerprints.csv')
Do the PCA:
mat = complete %>%
filter(GENERIC_NAME != 'Tannic acid') %>%
select(Number_of_aliphatic_carboxylic_acids:Number_of_urea_groups) %>% as.matrix
rownames(mat) = complete %>% filter(GENERIC_NAME != 'Tannic acid') %>% select(GENERIC_NAME) %>% t %>% as.character
res.pca = PCA((mat), scale.unit = TRUE, ncp = 5, graph = F)
ind = get_pca_ind(res.pca)
ind_df = data.frame(ind$coord[,1], ind$coord[,2], ind$coord[,3])
colnames(ind_df) = c('Dim1', 'Dim2', 'Dim3')
And now, plot again:
test2 = ind_df %>% tibble
test2['Drugbank'] = complete %>% filter(GENERIC_NAME != 'Tannic acid') %>% select(GENERIC_NAME) %>% t %>% as.character
test2 = test2 %>%
mutate(biolog = ifelse(Drugbank %in% drugs, Drugbank, NA),
Category = ifelse(Drugbank %in% drugs, 'Biolog', 'Drugbank'),
Category = as.factor(Category))
fig <- plot_ly(data = test2, x = ~Dim1, y = ~Dim2, z = ~Dim3, text = test2$Drugbank,
color = ~Category, colors = c('#FF0900', '#B8B8B8'),
marker = list(size = 3))
fig
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
Notice that I was removing Tanic Acid from the plot because it’s a super different compound that gets extremely separated from the rest. In any case, there is a major difference between this plot and the previous one because the cloud of points have a different shape. I don’t think this is very important, but we need to have this in mind.